# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
} 
## Loading required package: librarian
librarian::shelf(
  dismo, dplyr, DT, ggplot2, here, htmltools, leaflet, mapview, purrr, raster, readr, rgbif, rgdal, rJava, sdmpredictors, sf, spocc, tidyr, geojsonio)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select

# set random seed for reproducibility
set.seed(42)

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = F)

Get Species Observations

obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")
redo    <- FALSE

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  (res <- spocc::occ(
    query = 'Giraffa',
    #query = 'Panthera uncia', 
    from = 'gbif', has_coords = T, limit = 10000))
  
  # extract data frame from result
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)
  
  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) # save space (joinable from obs_csv)
  sf::write_sf(obs, obs_geo, delete_dsn=T)
}
obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9289
# convert to points of observation from lon/lat columns in data frame
# obs <- df %>% 
#  sf::st_as_sf(
#    coords = c("longitude", "latitude"),
 #   crs = st_crs(4326))

#readr::write_csv(df, obs_csv)
#geojson_write(obs, obs_geo)
# show points on map
mapview::mapview(obs, map.types = "Stamen.Terrain")

Get Environmental Data

dir_env <- file.path(dir_data, "env")

# set a default data directory
options(sdmpredictors_datadir = dir_env)

# choosing terrestrial
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# show table of datasets
env_datasets %>% 
  select(dataset_code, description, citation) %>% 
  DT::datatable()
# choose datasets for a vector
env_datasets_vec <- c("WorldClim", "ENVIREM")

# get layers
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)
# choose layers after some inspection and perhaps consulting literature
env_layers_vec <- c("WC_alt", "WC_bio1", "WC_bio2", "ER_tri", "ER_topoWet")

# get layers
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc=2)

obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # make convex hull around points of observation
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points on map
mapview(
  list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite=T)  
}
env_stack <- stack(env_stack_grd)

# show map
# mapview(obs) + 
#   mapview(env_stack, hide = T) # makes html too big for Github
plot(env_stack, nc=2)

Pseudo-Absence

absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # show map
  # mapview(obs) + 
  #   mapview(r_obs)
  
  # create mask for 
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse=T)
  
  # generate random points inside mask
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn=T)
}
absence <- read_sf(absence_geo)

# show map of presence, ie obs, and absence
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
## Warning in cbind(`Feature ID` = fid, mat): number of rows of result is not a
## multiple of vector length (arg 1)
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(
        present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(
        present = 0,
        key     = NA)) %>% 
    mutate(
      ID = 1:n()) %>% 
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn=T)

  # extract raster values for points
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df=TRUE) %>% 
    tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(
      pts %>% 
        select(ID, present),
      by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(
      #present = factor(present),
      lon = st_coordinates(geometry)[,1],
      lat = st_coordinates(geometry)[,2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)
## Rows: 18578 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet, lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env)-9):nrow(pts_env))) %>% 
  DT::datatable(
    rownames = F,
    options = list(
      dom = "t",
      pageLength = 20))

Term Plots

pts_env %>% 
  select(-ID) %>% 
  mutate(
    present = factor(present)) %>% 
  pivot_longer(-present) %>% 
  ggplot() +
  geom_density(aes(x = value, fill = present)) + 
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand=c(0,0)) +
  scale_y_continuous(expand=c(0,0)) +
  theme_bw() + 
  facet_wrap(~name, scales = "free") +
  theme(
    legend.position = c(1, 0),
    legend.justification = c(1, 0))
## Warning: Removed 147 rows containing non-finite values (stat_density).

Lab 1b. Species Distribution Modeling - Logistic Regression

Explore (cont’d)

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

pts_env <- read_csv(pts_env_csv)
## Rows: 18578 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet, lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
nrow(pts_env)
## [1] 18578
datatable(pts_env, rownames = F)
GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 45 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 50 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 32 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 50 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 32 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 50 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 32 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning: Removed 45 rows containing missing values (geom_point).
## Warning: Removed 50 rows containing missing values (geom_point).

## Warning: Removed 50 rows containing missing values (geom_point).

## Warning: Removed 50 rows containing missing values (geom_point).
## Warning: Removed 45 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 47 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 45 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 45 rows containing missing values
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 32 rows containing missing values (geom_point).

## Warning: Removed 32 rows containing missing values (geom_point).

## Warning: Removed 32 rows containing missing values (geom_point).
## Warning: Removed 47 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 27 rows containing missing values
## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 45 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).

## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 45 rows containing missing values (geom_point).
## Warning: Removed 27 rows containing missing values (geom_point).

Logistic Regression

# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 18526

Linear Model

# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1249 -0.3231  0.1251  0.3013  1.4554 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.904e-01  4.147e-02  11.825  < 2e-16 ***
## WC_alt       7.516e-05  6.148e-06  12.224  < 2e-16 ***
## WC_bio1      5.169e-03  7.314e-04   7.066 1.65e-12 ***
## WC_bio2      3.692e-02  1.360e-03  27.153  < 2e-16 ***
## ER_tri      -3.093e-03  1.594e-04 -19.401  < 2e-16 ***
## ER_topoWet  -4.921e-02  3.167e-03 -15.537  < 2e-16 ***
## lon          2.834e-03  6.435e-05  44.045  < 2e-16 ***
## lat         -1.224e-02  1.541e-04 -79.415  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3808 on 18518 degrees of freedom
## Multiple R-squared:  0.4202, Adjusted R-squared:  0.4199 
## F-statistic:  1917 on 7 and 18518 DF,  p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.6636976  1.1248851
range(y_true)
## [1] 0 1

Generalized Linear Model

# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0108  -0.4446  -0.0004   0.6566   5.4968  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.362e+00  4.379e-01  -14.53   <2e-16 ***
## WC_alt       2.220e-03  7.351e-05   30.20   <2e-16 ***
## WC_bio1      3.728e-01  1.214e-02   30.71   <2e-16 ***
## WC_bio2      2.150e-01  1.105e-02   19.45   <2e-16 ***
## ER_tri      -3.367e-02  1.995e-03  -16.88   <2e-16 ***
## ER_topoWet  -5.579e-01  3.139e-02  -17.77   <2e-16 ***
## lon          3.414e-02  8.102e-04   42.13   <2e-16 ***
## lat         -1.128e-01  1.972e-03  -57.20   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25682  on 18525  degrees of freedom
## Residual deviance: 13907  on 18518  degrees of freedom
## AIC: 13923
## 
## Number of Fisher Scoring iterations: 7
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 6.769232e-08 9.892462e-01
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim = "free")

Generalized Additive Model

librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -11.82       2.29   -5.16 2.47e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df  Chi.sq p-value    
## s(WC_alt)     6.979  7.170   78.75  <2e-16 ***
## s(WC_bio1)    6.937  7.058  287.85  <2e-16 ***
## s(WC_bio2)    8.906  8.996  505.59  <2e-16 ***
## s(ER_tri)     6.500  7.227   60.94  <2e-16 ***
## s(ER_topoWet) 8.928  8.994  218.83  <2e-16 ***
## s(lon)        8.983  9.000  771.16  <2e-16 ***
## s(lat)        8.982  9.000 1261.13  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.812   Deviance explained =   75%
## UBRE = -0.64784  Scale est. = 1         n = 18526
# show term plots
plot(mdl, scale=0)

Maxent (Maximum Entropy)

# load extra packages
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
maxent()
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)
# plot variable contributions per predictor
plot(mdl)

# plot term plots
response(mdl)

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

# Lab 1c. Species Distribution Modeling - Decision Trees

1. Decision Trees

# global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE)

# load packages
librarian::shelf(
  caret,       # m: modeling framework
  dplyr, ggplot2 ,here, readr, 
  pdp,         # X: partial dependence plots
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip)         # X: variable importance
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# options
options(
  scipen = 999,
  readr.show_col_types = F)
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data    <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# read data
pts_env <- read_csv(pts_env_csv)
## Rows: 18578 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (9): ID, present, WC_alt, WC_bio1, WC_bio2, ER_tri, ER_topoWet, lon, lat
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d <- pts_env %>% 
  select(-ID) %>%                   # not used as a predictor x
  mutate(
    present = factor(present)) %>%  # categorical response
  na.omit()                         # drop rows with NA
skim(d)
Data summary
Name d
Number of rows 18526
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9264, 1: 9262

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 871.95 783.23 -379.00 285.00 725.00 1209.00 5810.00 ▇▆▁▁▁
WC_bio1 0 1 20.97 5.87 -10.90 18.80 22.20 24.60 30.50 ▁▁▂▇▇
WC_bio2 0 1 13.32 2.53 4.70 11.60 13.60 15.40 20.40 ▁▃▇▆▁
ER_tri 0 1 23.24 32.24 0.00 5.37 12.31 33.94 368.11 ▇▁▁▁▁
ER_topoWet 0 1 12.00 1.63 6.31 11.27 12.18 12.99 15.86 ▁▂▇▇▂
lon 0 1 19.09 45.04 -121.21 12.54 31.71 36.99 136.96 ▂▂▇▇▁
lat 0 1 3.91 20.31 -34.38 -9.68 -0.47 19.21 52.15 ▅▇▅▅▂

1.1 Split data into training and testing

# create training set with 80% of full data
d_split  <- rsample::initial_split(d, prop = 0.8, strata = "present")
d_train  <- rsample::training(d_split)

# show number of rows present is 0 vs 1
table(d$present)
## 
##    0    1 
## 9264 9262
table(d_train$present)
## 
##    0    1 
## 7411 7409

1.2 Partition, depth=1

# run decision stump model
mdl <- rpart(
  present ~ ., data = d_train, 
  control = list(
    cp = 0, minbucket = 5, maxdepth = 1))
mdl
## n= 14820 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 14820 7409 0 (0.50006748 0.49993252)  
##   2) lat>=4.34595 5670  566 0 (0.90017637 0.09982363) *
##   3) lat< 4.34595 9150 2307 1 (0.25213115 0.74786885) *
# plot tree 
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl)

## 1.3 Partition, depth=default

# decision tree with defaults
mdl <- rpart(present ~ ., data = d_train)
mdl
## n= 14820 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 14820 7409 0 (0.500067476 0.499932524)  
##    2) lat>=4.34595 5670  566 0 (0.900176367 0.099823633)  
##      4) WC_bio1< 28.75 5059  121 0 (0.976082230 0.023917770) *
##      5) WC_bio1>=28.75 611  166 1 (0.271685761 0.728314239)  
##       10) lat>=13.60831 162   13 0 (0.919753086 0.080246914) *
##       11) lat< 13.60831 449   17 1 (0.037861915 0.962138085) *
##    3) lat< 4.34595 9150 2307 1 (0.252131148 0.747868852)  
##      6) lon< 13.52417 1306    9 0 (0.993108729 0.006891271) *
##      7) lon>=13.52417 7844 1010 1 (0.128760836 0.871239164)  
##       14) lon< 30.71002 2071  672 1 (0.324480927 0.675519073)  
##         28) lat>=-15.83333 499  108 0 (0.783567134 0.216432866) *
##         29) lat< -15.83333 1572  281 1 (0.178753181 0.821246819) *
##       15) lon>=30.71002 5773  338 1 (0.058548415 0.941451585) *
rpart.plot(mdl)

# plot complexity parameter
plotcp(mdl)

# rpart cross validation results
mdl$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.61222837      0 1.0000000 1.0194358 0.008213980
## 2 0.17384262      1 0.3877716 0.3958699 0.006546489
## 3 0.03765690      2 0.2139290 0.2181131 0.005121412
## 4 0.01909839      3 0.1762721 0.1838305 0.004746736
## 5 0.01835605      5 0.1380753 0.1479282 0.004299935
## 6 0.01000000      6 0.1197193 0.1226886 0.003942552

1.4 Feature interpretation

# caret cross validation results
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

ggplot(mdl_caret)

vip(mdl_caret, num_features = 40, bar = FALSE)

# Construct partial dependence plots
p1 <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
p2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
p3 <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>% 
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE, 
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display plots side by side
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

2. Random Forests

# load additional packages
librarian::shelf(
  ranger)  # random forest modeling

2.1 Fit

# number of features
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
mdl_rf <- ranger(present ~ ., data = d_train)

# get out of the box RMSE
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.1439279

2.2 Feature Interpretation

# re-run model with impurity-based variable importance
mdl_impurity <- ranger(
  present ~ ., data = d_train,
  importance = "impurity")

# re-run model with permutation-based variable importance
mdl_permutation <- ranger(
  present ~ ., data = d_train,
  importance = "permutation")
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)

gridExtra::grid.arrange(p1, p2, nrow = 1)

Lab 1d. Species Distribution Modeling - Evaluate Models

1. Setup

# global knitr chunk options
knitr::opts_chunk$set(
  warning = FALSE, 
  message = FALSE)

# load packages
librarian::shelf(
  dismo, # species distribution modeling: maxent(), predict(), evaluate(), 
  dplyr, ggplot2, GGally, here, maptools, readr, 
  raster, readr, rsample, sf,
  usdm)  # uncertainty analysis for species distribution models: vifcor()
select = dplyr::select

# options
set.seed(42)
options(
  scipen = 999,
  readr.show_col_types = F)
ggplot2::theme_set(ggplot2::theme_light())

# paths
dir_data      <- here("data/sdm")
pts_geo       <- file.path(dir_data, "pts.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")
mdl_maxv_rds  <- file.path(dir_data, "mdl_maxent_vif.rds")

# read points of observation: presence (1) and absence (0)
pts <- read_sf(pts_geo)

# read raster stack of environment
env_stack <- raster::stack(env_stack_grd)

1.1 Split observations into training and testing

# create training set with 80% of full data
pts_split  <- rsample::initial_split(
  pts, prop = 0.8, strata = "present")
pts_train  <- rsample::training(pts_split)
pts_test   <- rsample::testing(pts_split)

pts_train_p <- pts_train %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_train_a <- pts_train %>% 
  filter(present == 0) %>% 
  as_Spatial()

2 Calibrate: Model Selection

# show pairs plot before multicollinearity reduction with vifcor()
pairs(env_stack)

# calculate variance inflation factor per predictor, a metric of multicollinearity between variables

vif(env_stack)
##    Variables      VIF
## 1     WC_alt 2.958356
## 2    WC_bio1 2.035343
## 3    WC_bio2 1.391806
## 4     ER_tri 3.573421
## 5 ER_topoWet 3.564885
# stepwise reduce predictors, based on a max correlation of 0.7 (max 1)
v <- vifcor(env_stack, th=0.7) 
v
## 1 variables from the 5 input variables have collinearity problem: 
##  
## ER_topoWet 
## 
## After excluding the collinear variables, the linear correlation coefficients ranges between: 
## min correlation ( WC_bio2 ~ WC_bio1 ):  -0.009173593 
## max correlation ( WC_bio1 ~ WC_alt ):  -0.6939305 
## 
## ---------- VIFs of the remained variables -------- 
##   Variables      VIF
## 1    WC_alt 3.104773
## 2   WC_bio1 2.128335
## 3   WC_bio2 1.418265
## 4    ER_tri 1.671422
# reduce enviromental raster stack by 
env_stack_v <- usdm::exclude(env_stack, v)

# show pairs plot after multicollinearity reduction with vifcor()
pairs(env_stack_v)

# fit a maximum entropy model
if (!file.exists(mdl_maxv_rds)){
  mdl_maxv <- maxent(env_stack_v, sf::as_Spatial(pts_train))
  readr::write_rds(mdl_maxv, mdl_maxv_rds)
}
mdl_maxv <- read_rds(mdl_maxv_rds)

# plot variable contributions per predictor
plot(mdl_maxv)

# plot term plots
response(mdl_maxv)

# predict
y_maxv <- predict(env_stack, mdl_maxv) #, ext=ext, progress='')

plot(y_maxv, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

3 Evaluate: Model Performance

pts_test_p <- pts_test %>% 
  filter(present == 1) %>% 
  as_Spatial()
pts_test_a <- pts_test %>% 
  filter(present == 0) %>% 
  as_Spatial()

y_maxv <- predict(mdl_maxv, env_stack)
#plot(y_maxv)

e <- dismo::evaluate(
  p     = pts_test_p,
  a     = pts_test_a, 
  model = mdl_maxv,
  x     = env_stack)
e
## class          : ModelEvaluation 
## n presences    : 1852 
## n absences     : 1854 
## AUC            : 0.8443008 
## cor            : 0.5991451 
## max TPR+TNR at : 0.6511196
plot(e, 'ROC')

thr <- threshold(e)[['spec_sens']]
thr
## [1] 0.6511196
p_true <- na.omit(raster::extract(y_maxv, pts_test_p) >= thr)
a_true <- na.omit(raster::extract(y_maxv, pts_test_a) < thr)

# (t)rue/(f)alse (p)ositive/(n)egative rates
tpr <- sum(p_true)/length(p_true)
fnr <- sum(!p_true)/length(p_true)
fpr <- sum(!a_true)/length(a_true)
tnr <- sum(a_true)/length(a_true)

matrix(
  c(tpr, fnr,
    fpr, tnr), 
  nrow=2, dimnames = list(
    c("present_obs", "absent_obs"),
    c("present_pred", "absent_pred")))
##             present_pred absent_pred
## present_obs    0.7462203   0.2065804
## absent_obs     0.2537797   0.7934196
# add point to ROC plot
plot(e, 'ROC')
points(fpr, tpr, pch=23, bg="blue")

plot(y_maxv > thr)